library("SummarizedExperiment")
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
##
## aperm, apply, rowsum
library("readODS")
library("scatterD3")
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:IRanges':
##
## slice
## The following object is masked from 'package:S4Vectors':
##
## rename
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library (DESeq2)
DESeq2Normalization <- function(sExpr, iqr.cutoff=0.1, plot=c("2d", "3d")) {
###########################################################################
## PART 0. Check function arguments
###########################################################################
#fun.main <- deparse(match.call()[[1]])
# .stopIfNotExpressionSetOrSummarizedExperiment(inputObj, 'inputObj', fun.main)
# .stopIfNotNumeric0to1(iqr.cutoff, 'min.diff.cutoff', fun.main)
# sExpr <- .tryMakeSummarizedExperimentFromExpressionSet(inputObj)
###########################################################################
## PART I. Filter samples according to phenoData
###########################################################################
## o phenoData table contains which samples should be used in the analysis
## o the samples which should be used in the analysis will have an
## assigned category sExpr@phenoData$category, as "standard" or "test"
## o non-assigned samples with NA values will be ignored
## o NOTE
## assigning NA values to samples is an easy way to eliminate samples
## from the analysis, without having to remove them from all input tables
## (eg removing from sExpr, pdata, calls)
phenoData <- colData(sExpr)
## filter out samples with missing category and/or general cell type
phenoData.sel <- .filterPheno(phenoData, fun.main, "na")
############################################################################
## PART II. Filter genes in dataset
############################################################################
## A. For the cosine scoring, keep only the samples in the selected category
## (either 'standard' or 'test')
#if (sum(phenoData.sel) == 0)
# stop(paste("No samples in selected category found, exiting function",
# fun.main))
sExpr <- sExpr[, rownames(phenoData.sel)]
# remove genes without any counts
sExpr <- sExpr[rowSums(assay(sExpr, "counts")) > 0 , ]
# Create DESeq.ds from the summarizedExperiment object
DESeq.ds <- DESeqDataSetFromMatrix(countData = (assay(sExpr, "counts") + 1),
colData = colData(sExpr),
rowData = rowData(sExpr),
design = ~cell_type)
# DESeq2 Default normalization method
DESeq.dsDefault <- estimateSizeFactors(DESeq.ds)
counts.sf_normalized <- counts(DESeq.dsDefault, normalized = TRUE)
log.norm.counts <- log2(counts.sf_normalized + 1)
log.norm.counts
}
#TPMNormalization <- function(sExpr, iqr.cutoff=0.1, plot=c("2d", "3d")) {
# ###########################################################################
# ## PART 0. Check function arguments
# ###########################################################################
# #fun.main <- deparse(match.call()[[1]])
# # .stopIfNotExpressionSetOrSummarizedExperiment(inputObj, 'inputObj', fun.main)
# # .stopIfNotNumeric0to1(iqr.cutoff, 'min.diff.cutoff', fun.main)
# # sExpr <- .tryMakeSummarizedExperimentFromExpressionSet(inputObj)
#
# ###########################################################################
# ## PART I. Filter samples according to phenoData
# ###########################################################################
# ## o phenoData table contains which samples should be used in the analysis
# ## o the samples which should be used in the analysis will have an
# ## assigned category sExpr@phenoData$category, as "standard" or "test"
# ## o non-assigned samples with NA values will be ignored
# ## o NOTE
# ## assigning NA values to samples is an easy way to eliminate samples
# ## from the analysis, without having to remove them from all input tables
# ## (eg removing from sExpr, pdata, calls)
# phenoData <- colData(sExpr)
# ## filter out samples with missing category and/or general cell type
# phenoData.sel <- .filterPheno(phenoData, fun.main, "na")
#
# ############################################################################
# ## PART II. Filter genes in dataset
# ############################################################################
# ## A. For the cosine scoring, keep only the samples in the selected category
# ## (either 'standard' or 'test')
#
# #if (sum(phenoData.sel) == 0)
# # stop(paste("No samples in selected category found, exiting function",
# # fun.main))
#
# sExpr <- sExpr[, rownames(phenoData.sel)]
#
# # remove genes without any counts
# sExpr <- sExpr[rowSums(assay(sExpr, "counts")) > 0 , ]
#
# #
#}
PcaPlot <- function(normalizedCount, pdata, colorVar,
symbolVar, plot = c("3d", "2d")) {
############################################################################
## PART III. PCA Analysis of the filtered data
############################################################################
## 1. Euclidean distance between categories
## 2. Cosine similarity between categories
## -DONT USE-3. cosine "score" between categories
## o 0 < cosine score <1 where range is set to 1-min(cosine.similarity)
## What is this good for?
## a. Visualization and exploratory data analysis
## b. Generates values to be used for cell scoring
## Remove any rows that have zero variance
pca <- prcomp(t(normalizedCount), scale=TRUE)
## Get proportion of variance explained by PC1 and PC2
pca.sum <- summary(pca)
pc1 <- pca.sum$importance[2,1] * 100
pc2 <- pca.sum$importance[2,2] * 100
pc3 <- pca.sum$importance[2,3] * 100
print(pc1)
print(pc2)
print(pc3)
## Extract coordinates from pca object
pca.comp <- as.data.frame(pca$x[, c(1, 2, 3)])
pca.comp <- cbind(pca.comp,
cell_type = pdata[, "cell_type"],
category = pdata[, "category"])
tooltips <- paste(" <strong>", rownames(pca.comp),"</strong><br />", pca.comp$cell_type)
if (plot == "2d") {
## 2D plot
scatterD3(data = pca.comp, x=PC1, y=PC2, tooltip_text = tooltips,
col_var=cell_type, symbol_var=category)
} else {
## 3D plot
fig <- plot_ly(pca.comp, x = ~PC1, y = ~PC2, z = ~PC3, color = ~cell_type, symbol = ~category, symbols = c('circle','x','o'))
fig <- fig %>% add_markers()
fig <- fig %>% layout(scene = list(xaxis = list(title = paste('PC1')),
yaxis = list(title = ('PC2')),
zaxis = list(title = ('PC3'))))
fig
}
}
################################################################################
## FUNCTION: .filterPheno
################################################################################
#' From utils.R in CellScore
#' Filters samples from phenotype data with missing info
#' @param pheno a DataFrame with phenotype descriptions
#' @param calling.fun a character - name of the parent function calling this
#' function
#' @param flag a character - type of filter, one of the following: 'na',
#' 'group', subgroup', anygroup' (the last option checks for any match in
#' group or subgroup)
#' @param flag.value a character - cell name needed for the 'specifc' filter
#' @return filtered data.frame
#' @keywords filter
.filterPheno <- function(pheno, calling.fun,
flag = c("na", "group", "subgroup", "anygroup"),
flag.value=NULL){
## If filtering by (sub)groups, the flag.value must be specified
stopifnot(flag == "na" || !is.null(flag.value))
sel <- switch(flag,
na = !is.na(pheno$category) & !is.na(pheno$general_cell_type),
group = pheno$general_cell_type %in% flag.value,
subgroup = pheno$sub_cell_type1 %in% flag.value,
anygroup = pheno$general_cell_type %in% flag.value |
pheno$sub_cell_type1 %in% flag.value)
if (sum(sel) == 0){
print(sel)
stop(paste("No samples left after phenotype filtering, exiting function",
calling.fun))
}
pheno[sel, ]
}
# read the file selectedDEE2SExpr.rds that stores SummarizedExperiment object
# of the count data.
selectedDEE2SExpr <- readRDS("../selectedDEE2SExpr.rds")
# normalize the count data
normalizedCount <- DESeq2Normalization(selectedDEE2SExpr)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
#
PcaPlot(normalizedCount, colData(selectedDEE2SExpr), plot="2d")
## [1] 21.015
## [1] 7.024
## [1] 5.963
PcaPlot(normalizedCount, colData(selectedDEE2SExpr), plot="3d")
## [1] 21.015
## [1] 7.024
## [1] 5.963
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors